In [1]:
# Ignore a bunch of deprecation warnings
import sys
sys.path.append('../..')
import warnings
warnings.filterwarnings("ignore")

import copy
import os
import time
from tqdm import tqdm
import math

import ddsp
import ddsp.training

from data_handling.ddspdataset import DDSPDataset
from utils.training_utils import print_hparams, set_seed, save_results, str2bool
from hparams_midiae_interp_cond import hparams as hp
from midiae_interp_cond.get_model import get_model, get_fake_data

import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import pandas as pd
import qgrid

from notebook_utils import *

# Seed through the project helper (which RNGs it seeds is defined in utils.training_utils).
set_seed(1234)

# Audio sample rate used by this notebook (16 kHz).
sample_rate = 16000


# Marker showing that the setup cell ran to completion.
print('Done!')
Done!
In [2]:
# Checkpoint to restore. Set the DDSP_MODEL_PATH environment variable to use another
# checkpoint without editing the notebook; the default is the original hard-coded path.
model_path = os.environ.get('DDSP_MODEL_PATH', r'/data/ddsp-experiment/logs/5.13_samples/150000')
# Read the hyper-parameters from the train.log that sits in the checkpoint's parent
# directory and copy each entry onto the shared `hp` object.
hp_dict = get_hp(os.path.join(os.path.dirname(model_path), 'train.log'))
for k, v in hp_dict.items():
    setattr(hp, k, v)
# Override the logged value: this notebook works with sequences of 1000 steps.
hp.sequence_length = 1000
In [3]:
# Alternative test set (URMP loader), kept for reference. Uncomment to use it instead.
# from data_handling.urmp_tfrecord_dataloader import UrmpMidi
# from data_handling.get_tfrecord_length import get_tfrecord_length
# data_dir = r'/data/music_dataset/urmp_dataset/tfrecord_ddsp/batched/solo_instrument'
# test_data_loader = UrmpMidi(data_dir, instrument_key='vn', split='test')
# evaluation_data = test_data_loader.get_batch(batch_size=1, shuffle=True, repeats=1)

from data_handling.google_solo_inst_dataloader import GoogleSoloInstrument

# Dataset root. Set the DDSP_DATA_DIR environment variable to point at another copy of
# the features; the default is the original hard-coded location.
data_dir = os.environ.get(
    'DDSP_DATA_DIR',
    r'/data/music_dataset/solo_performance_google/solo-inst_midi_features',
)
test_data_loader = GoogleSoloInstrument(
    base_dir=data_dir,
    instrument_key='sax',
    split='test',
)
# One example per batch, shuffled, a single pass over the test split.
evaluation_data = test_data_loader.get_batch(batch_size=1, shuffle=True, repeats=1)
In [4]:
# Wrap the batched test data in an iterator so later cells can pull samples with next().
evaluation_data = iter(evaluation_data)
In [5]:
# Instantiate the model described by the restored hyper-parameters.
model = get_model(hp)
# Run one build pass on the placeholder batch from get_fake_data before loading weights
# (presumably so the variables exist when the checkpoint is restored — TODO confirm).
_ = model._build(get_fake_data(hp))
# Restore the checkpoint; the returned load status is only displayed, not checked.
model.load_weights(model_path)
Out[5]:
<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7398568d10>
In [6]:
# Pull the first test example; it is used by the synth-coder cells below.
sample = next(evaluation_data)
In [7]:
from midiae_interp_cond.interpretable_conditioning import midi_to_hz, get_interpretable_conditioning, extract_harm_controls
In [8]:
# Spectrogram of the reference audio. Use the notebook-level `sample_rate` constant
# instead of repeating the literal 16000.
plot_spec(sample['audio'][0].numpy(), sr=sample_rate)
In [9]:
# Synth-coder path: produces synth parameters, control parameters and audio for the sample.
(synth_params,
 control_params,
 synth_audio) = model.run_synth_coder(sample, training=False)

# Conditioning path: produces the normalized synth parameters, the MIDI features and the
# conditioning dictionary for the same sample.
(synth_params_normalized,
 midi_features,
 conditioning_dict) = model.gen_cond_dict_from_feature(sample, training=False)
In [10]:
# Generate audio from the unmodified conditioning of the current sample.
midi_audio, params = model.gen_audio_from_cond_dict(
    conditioning_dict,
    midi_features,
    instrument_id=sample['instrument_id'],
)

Synth-coder Prediction (ld, f0, mel -> synth params)

In [11]:
# Unpack the synth parameters returned by gen_cond_dict_from_feature.
f0, amps, hd, noise = synth_params_normalized
# f0 is converted from Hz to MIDI pitch for plotting.
f0_midi = ddsp.core.hz_to_midi(f0)
# Keep the converted tuple under its own name instead of overwriting
# synth_params_normalized: with the overwrite, re-running this cell applied hz_to_midi
# to an already converted f0 and silently corrupted the plot.
synth_params_midi = (f0_midi, amps, hd, noise)
plot_pred_acoustic_feature(sample['audio'].numpy()[0], synth_audio.numpy()[0], get_synth_params(synth_params_midi), mask_zero_f0=True)

Conditioning

The function of each conditioning:

Loudness:

  • loudness mean: overall volume of a note
  • loudness std: the extent to which the volume changes within a note (crescendo & decrescendo)
  • amplitudes_max_pos: relative position (0-1) inside a note where the amplitudes reach their maximum (=0 decrescendo, =1 crescendo)

Attack:

  • attack_level: the level of note attack (the average amount of noise in the first 10 frames of each note)

Timbre:

  • brightness: controls the average timbre of a note (centroid of harmonic distribution.)

Pitch:

  • pitch variation std: controls the extent of the vibrato, taken from the amplitude of the rfft (it should really be called "vibrato extent", but the name was kept for compatibility)
  • vibrato rate: rate of the vibrato (taken from rfft)

Conditionings are note-pooled (one value per note). The conditioning values of rests are masked to 0.

In [12]:
# Advance to the next test example; all conditioning experiments below use this sample.
sample = next(evaluation_data)
In [13]:
# Extract the conditioning for the new sample.
(synth_params_normalized,
 midi_features,
 conditioning_dict) = model.gen_cond_dict_from_feature(sample, training=False)
In [ ]:
 
In [14]:
# Tabulate the conditioning together with the note onsets, offsets and MIDI pitches.
conditioning_df = conditioining_dict_to_df(
    conditioning_dict,
    sample['onsets'],
    sample['offsets'],
    sample['midi'],
)
# Show the table in a qgrid widget so individual values can be edited by hand.
qgrid_widget = qgrid.show_grid(conditioning_df, show_toolbar=True)
qgrid_widget
In [15]:
# Read back whatever was edited in the grid widget. NOTE: the result of this cell depends
# on the manual edits, so it is not reproducible from code alone.
conditioning_df_changed = qgrid_widget.get_changed_df()
# Convert the table back into the conditioning dict. The length comes from
# hp.sequence_length (set when the hyper-parameters were loaded) instead of a repeated
# literal 1000, so the two values cannot drift apart.
conditioning_dict = conditioning_df_to_dict(conditioning_df_changed, length=hp.sequence_length)
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(
    conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [16]:
# Display the table as read back from the grid widget (the values used for regeneration).
conditioning_df_changed
Out[16]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.187784 0.055929 0.000000 0.092456 0.070266 0.000000 0.000000 58 0 13 13
1 0.200037 0.003491 0.000000 0.084715 0.079794 0.000000 0.500000 57 14 15 1
2 0.181757 0.000000 0.000000 0.091589 0.100020 0.000000 0.000000 58 16 31 15
3 0.660257 0.082758 0.000000 0.124652 0.325231 0.000000 0.593750 58 32 63 31
4 0.598673 0.007671 0.000000 0.144210 0.464790 0.000000 0.500000 59 64 65 1
5 0.665977 0.042640 0.000000 0.114072 0.433672 0.000000 0.666667 60 66 86 20
6 0.679594 0.033672 0.000000 0.116638 0.404232 0.000000 0.666667 61 87 116 29
7 0.659117 0.030425 0.000000 0.109499 0.398962 0.000000 0.370370 60 117 143 26
8 0.674703 0.025409 0.000000 0.151181 0.384552 0.000000 0.242857 58 144 213 69
9 0.621182 0.026201 0.000000 0.158142 0.270815 0.000000 0.000000 59 214 219 5
10 0.639629 0.066777 0.413912 0.144228 0.273157 7.257257 0.177632 57 220 371 151
11 0.456946 0.014576 0.000000 0.039509 0.235051 0.000000 0.000000 54 372 376 4
12 0.698726 0.058970 0.000000 0.186284 0.283507 0.000000 0.180328 55 377 437 60
13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0 438 444 6
14 0.540217 0.045851 0.000000 0.068520 0.280882 0.000000 0.675676 50 445 481 36
15 0.473179 0.097769 0.000000 0.044180 0.274952 0.000000 0.928571 52 482 495 13
16 0.718757 0.017547 0.000000 0.220133 0.369267 0.000000 0.067797 53 496 554 58
17 0.631286 0.010678 0.000000 0.154662 0.389493 0.000000 0.000000 54 555 556 1
18 0.680718 0.060937 0.105454 0.181939 0.364396 4.004004 0.237705 55 557 678 121
19 0.517053 0.013671 0.000000 0.102532 0.383083 0.000000 0.000000 54 679 681 2
20 0.565903 0.011008 0.000000 0.117449 0.311977 0.000000 0.153846 53 682 694 12
21 0.505786 0.050207 0.000000 0.083146 0.280099 0.000000 0.000000 54 695 702 7
22 0.405598 0.012963 0.000000 0.036016 0.135055 0.000000 0.200000 55 703 707 4
23 0.354084 0.011074 0.000000 0.030930 0.176790 0.000000 0.000000 56 708 714 6
24 0.352441 0.006915 0.000000 0.033833 0.259730 0.000000 0.666667 55 715 720 5
25 0.495777 0.128361 0.000000 0.046683 0.270849 0.000000 0.944444 54 721 738 17
26 0.660683 0.048071 0.685653 0.155514 0.420273 7.507507 0.000000 55 739 828 89
27 0.411062 0.052837 0.000000 0.038148 0.153582 0.000000 0.000000 56 829 841 12
28 0.284760 0.020876 0.000000 0.043985 0.192838 0.000000 0.000000 55 842 844 2
29 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0 845 959 114
30 -0.047100 0.018558 0.000000 0.077097 -0.004628 0.000000 -96.000000 55 960 969 9
31 0.029876 0.025919 0.000000 0.075516 0.030284 0.000000 0.714286 56 970 976 6
32 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0 977 999 22

Default Value (Reconstruction)

In [17]:
# Reference audio versus the audio regenerated from the grid's conditioning values.
plot_pred_acoustic_feature(
    sample['audio'].numpy()[0],
    midi_audio_changed.numpy()[0],
    get_synth_params(params_changed),
    mask_zero_f0=True,
)

ALL vibrato

In [18]:
# Start again from the conditioning extracted from the sample so earlier edits do not leak in.
(synth_params_normalized,
 midi_features,
 conditioning_dict) = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(
    conditioning_dict,
    sample['onsets'],
    sample['offsets'],
    sample['midi'],
)
# Give every row the same vibrato: rate 5.25 and pitch_variation_std 1.
conditioning_df = conditioning_df.assign(
    vibrato_rate=np.ones_like(conditioning_df['vibrato_rate'].values) * 5.25,
    pitch_variation_std=np.ones_like(conditioning_df['pitch_variation_std'].values),
)
conditioning_df
Out[18]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.187784 0.055929 1.0 0.092456 0.070266 5.25 0.000000 58 0 13 13
1 0.200037 0.003491 1.0 0.084715 0.079794 5.25 0.500000 57 14 15 1
2 0.181757 0.000000 1.0 0.091589 0.100020 5.25 0.000000 58 16 31 15
3 0.660257 0.082758 1.0 0.124652 0.325231 5.25 0.593750 58 32 63 31
4 0.598673 0.007671 1.0 0.144210 0.464790 5.25 0.500000 59 64 65 1
5 0.665977 0.042640 1.0 0.114072 0.433672 5.25 0.666667 60 66 86 20
6 0.679594 0.033672 1.0 0.116638 0.404232 5.25 0.666667 61 87 116 29
7 0.659117 0.030425 1.0 0.109499 0.398962 5.25 0.370370 60 117 143 26
8 0.674703 0.025409 1.0 0.151181 0.384552 5.25 0.242857 58 144 213 69
9 0.621182 0.026201 1.0 0.158142 0.270815 5.25 0.000000 59 214 219 5
10 0.639629 0.066777 1.0 0.144228 0.273157 5.25 0.177632 57 220 371 151
11 0.456946 0.014576 1.0 0.039509 0.235051 5.25 0.000000 54 372 376 4
12 0.698726 0.058970 1.0 0.186284 0.283507 5.25 0.180328 55 377 437 60
13 0.000000 0.000000 1.0 0.000000 0.000000 5.25 0.000000 0 438 444 6
14 0.540217 0.045851 1.0 0.068520 0.280882 5.25 0.675676 50 445 481 36
15 0.473179 0.097769 1.0 0.044180 0.274952 5.25 0.928571 52 482 495 13
16 0.718757 0.017547 1.0 0.220133 0.369267 5.25 0.067797 53 496 554 58
17 0.631286 0.010678 1.0 0.154662 0.389493 5.25 0.000000 54 555 556 1
18 0.680718 0.060937 1.0 0.181939 0.364396 5.25 0.237705 55 557 678 121
19 0.517053 0.013671 1.0 0.102532 0.383083 5.25 0.000000 54 679 681 2
20 0.565903 0.011008 1.0 0.117449 0.311977 5.25 0.153846 53 682 694 12
21 0.505786 0.050207 1.0 0.083146 0.280099 5.25 0.000000 54 695 702 7
22 0.405598 0.012963 1.0 0.036016 0.135055 5.25 0.200000 55 703 707 4
23 0.354084 0.011074 1.0 0.030930 0.176790 5.25 0.000000 56 708 714 6
24 0.352441 0.006915 1.0 0.033833 0.259730 5.25 0.666667 55 715 720 5
25 0.495777 0.128361 1.0 0.046683 0.270849 5.25 0.944444 54 721 738 17
26 0.660683 0.048071 1.0 0.155514 0.420273 5.25 0.000000 55 739 828 89
27 0.411062 0.052837 1.0 0.038148 0.153582 5.25 0.000000 56 829 841 12
28 0.284760 0.020876 1.0 0.043985 0.192838 5.25 0.000000 55 842 844 2
29 0.000000 0.000000 1.0 0.000000 0.000000 5.25 0.000000 0 845 959 114
30 -0.047100 0.018558 1.0 0.077097 -0.004628 5.25 -96.000000 55 960 969 9
31 0.029876 0.025919 1.0 0.075516 0.030284 5.25 0.714286 56 970 976 6
32 0.000000 0.000000 1.0 0.000000 0.000000 5.25 0.000000 0 977 999 22
In [19]:
# Convert the edited table back into the conditioning dict. The length comes from
# hp.sequence_length (set when the hyper-parameters were loaded) instead of a repeated
# literal 1000, so the two values cannot drift apart.
conditioning_dict = conditioning_df_to_dict(conditioning_df, length=hp.sequence_length)
# Regenerate audio and synth parameters from the edited conditioning.
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(
    conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [20]:
# Reference audio versus the audio regenerated with vibrato forced on every row.
plot_pred_acoustic_feature(
    sample['audio'].numpy()[0],
    midi_audio_changed.numpy()[0],
    get_synth_params(params_changed),
    mask_zero_f0=True,
)
In [ ]:
 

No vibrato

In [21]:
# Start again from the conditioning extracted from the sample so earlier edits do not leak in.
(synth_params_normalized,
 midi_features,
 conditioning_dict) = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(
    conditioning_dict,
    sample['onsets'],
    sample['offsets'],
    sample['midi'],
)
# Remove vibrato everywhere: both the rate and the pitch variation are set to zero.
conditioning_df = conditioning_df.assign(
    vibrato_rate=np.zeros_like(conditioning_df['vibrato_rate'].values),
    pitch_variation_std=np.zeros_like(conditioning_df['pitch_variation_std'].values),
)
conditioning_df
Out[21]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.187784 0.055929 0.0 0.092456 0.070266 0.0 0.000000 58 0 13 13
1 0.200037 0.003491 0.0 0.084715 0.079794 0.0 0.500000 57 14 15 1
2 0.181757 0.000000 0.0 0.091589 0.100020 0.0 0.000000 58 16 31 15
3 0.660257 0.082758 0.0 0.124652 0.325231 0.0 0.593750 58 32 63 31
4 0.598673 0.007671 0.0 0.144210 0.464790 0.0 0.500000 59 64 65 1
5 0.665977 0.042640 0.0 0.114072 0.433672 0.0 0.666667 60 66 86 20
6 0.679594 0.033672 0.0 0.116638 0.404232 0.0 0.666667 61 87 116 29
7 0.659117 0.030425 0.0 0.109499 0.398962 0.0 0.370370 60 117 143 26
8 0.674703 0.025409 0.0 0.151181 0.384552 0.0 0.242857 58 144 213 69
9 0.621182 0.026201 0.0 0.158142 0.270815 0.0 0.000000 59 214 219 5
10 0.639629 0.066777 0.0 0.144228 0.273157 0.0 0.177632 57 220 371 151
11 0.456946 0.014576 0.0 0.039509 0.235051 0.0 0.000000 54 372 376 4
12 0.698726 0.058970 0.0 0.186284 0.283507 0.0 0.180328 55 377 437 60
13 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.000000 0 438 444 6
14 0.540217 0.045851 0.0 0.068520 0.280882 0.0 0.675676 50 445 481 36
15 0.473179 0.097769 0.0 0.044180 0.274952 0.0 0.928571 52 482 495 13
16 0.718757 0.017547 0.0 0.220133 0.369267 0.0 0.067797 53 496 554 58
17 0.631286 0.010678 0.0 0.154662 0.389493 0.0 0.000000 54 555 556 1
18 0.680718 0.060937 0.0 0.181939 0.364396 0.0 0.237705 55 557 678 121
19 0.517053 0.013671 0.0 0.102532 0.383083 0.0 0.000000 54 679 681 2
20 0.565903 0.011008 0.0 0.117449 0.311977 0.0 0.153846 53 682 694 12
21 0.505786 0.050207 0.0 0.083146 0.280099 0.0 0.000000 54 695 702 7
22 0.405598 0.012963 0.0 0.036016 0.135055 0.0 0.200000 55 703 707 4
23 0.354084 0.011074 0.0 0.030930 0.176790 0.0 0.000000 56 708 714 6
24 0.352441 0.006915 0.0 0.033833 0.259730 0.0 0.666667 55 715 720 5
25 0.495777 0.128361 0.0 0.046683 0.270849 0.0 0.944444 54 721 738 17
26 0.660683 0.048071 0.0 0.155514 0.420273 0.0 0.000000 55 739 828 89
27 0.411062 0.052837 0.0 0.038148 0.153582 0.0 0.000000 56 829 841 12
28 0.284760 0.020876 0.0 0.043985 0.192838 0.0 0.000000 55 842 844 2
29 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.000000 0 845 959 114
30 -0.047100 0.018558 0.0 0.077097 -0.004628 0.0 -96.000000 55 960 969 9
31 0.029876 0.025919 0.0 0.075516 0.030284 0.0 0.714286 56 970 976 6
32 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.000000 0 977 999 22
In [22]:
# Convert the edited table back into the conditioning dict. The length comes from
# hp.sequence_length (set when the hyper-parameters were loaded) instead of a repeated
# literal 1000, so the two values cannot drift apart.
conditioning_dict = conditioning_df_to_dict(conditioning_df, length=hp.sequence_length)
# Regenerate audio and synth parameters from the edited conditioning.
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(
    conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [23]:
# Reference audio versus the audio regenerated with vibrato removed.
plot_pred_acoustic_feature(
    sample['audio'].numpy()[0],
    midi_audio_changed.numpy()[0],
    get_synth_params(params_changed),
    mask_zero_f0=True,
)

ALL crescendo

In [24]:
# Start again from the conditioning extracted from the sample so earlier edits do not leak in.
(synth_params_normalized,
 midi_features,
 conditioning_dict) = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(
    conditioning_dict,
    sample['onsets'],
    sample['offsets'],
    sample['midi'],
)
# Crescendo on every row: amplitude maximum at the end of the note (position 1) with a
# loudness std of 0.15.
conditioning_df = conditioning_df.assign(
    amplitudes_max_pos=np.ones_like(conditioning_df['amplitudes_max_pos'].values),
    loudness_std=np.ones_like(conditioning_df['loudness_std'].values) * 0.15,
)
conditioning_df
Out[24]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.187784 0.15 0.000000 0.092456 0.070266 0.000000 1.0 58 0 13 13
1 0.200037 0.15 0.000000 0.084715 0.079794 0.000000 1.0 57 14 15 1
2 0.181757 0.15 0.000000 0.091589 0.100020 0.000000 1.0 58 16 31 15
3 0.660257 0.15 0.000000 0.124652 0.325231 0.000000 1.0 58 32 63 31
4 0.598673 0.15 0.000000 0.144210 0.464790 0.000000 1.0 59 64 65 1
5 0.665977 0.15 0.000000 0.114072 0.433672 0.000000 1.0 60 66 86 20
6 0.679594 0.15 0.000000 0.116638 0.404232 0.000000 1.0 61 87 116 29
7 0.659117 0.15 0.000000 0.109499 0.398962 0.000000 1.0 60 117 143 26
8 0.674703 0.15 0.000000 0.151181 0.384552 0.000000 1.0 58 144 213 69
9 0.621182 0.15 0.000000 0.158142 0.270815 0.000000 1.0 59 214 219 5
10 0.639629 0.15 0.413912 0.144228 0.273157 7.257257 1.0 57 220 371 151
11 0.456946 0.15 0.000000 0.039509 0.235051 0.000000 1.0 54 372 376 4
12 0.698726 0.15 0.000000 0.186284 0.283507 0.000000 1.0 55 377 437 60
13 0.000000 0.15 0.000000 0.000000 0.000000 0.000000 1.0 0 438 444 6
14 0.540217 0.15 0.000000 0.068520 0.280882 0.000000 1.0 50 445 481 36
15 0.473179 0.15 0.000000 0.044180 0.274952 0.000000 1.0 52 482 495 13
16 0.718757 0.15 0.000000 0.220133 0.369267 0.000000 1.0 53 496 554 58
17 0.631286 0.15 0.000000 0.154662 0.389493 0.000000 1.0 54 555 556 1
18 0.680718 0.15 0.105454 0.181939 0.364396 4.004004 1.0 55 557 678 121
19 0.517053 0.15 0.000000 0.102532 0.383083 0.000000 1.0 54 679 681 2
20 0.565903 0.15 0.000000 0.117449 0.311977 0.000000 1.0 53 682 694 12
21 0.505786 0.15 0.000000 0.083146 0.280099 0.000000 1.0 54 695 702 7
22 0.405598 0.15 0.000000 0.036016 0.135055 0.000000 1.0 55 703 707 4
23 0.354084 0.15 0.000000 0.030930 0.176790 0.000000 1.0 56 708 714 6
24 0.352441 0.15 0.000000 0.033833 0.259730 0.000000 1.0 55 715 720 5
25 0.495777 0.15 0.000000 0.046683 0.270849 0.000000 1.0 54 721 738 17
26 0.660683 0.15 0.685653 0.155514 0.420273 7.507507 1.0 55 739 828 89
27 0.411062 0.15 0.000000 0.038148 0.153582 0.000000 1.0 56 829 841 12
28 0.284760 0.15 0.000000 0.043985 0.192838 0.000000 1.0 55 842 844 2
29 0.000000 0.15 0.000000 0.000000 0.000000 0.000000 1.0 0 845 959 114
30 -0.047100 0.15 0.000000 0.077097 -0.004628 0.000000 1.0 55 960 969 9
31 0.029876 0.15 0.000000 0.075516 0.030284 0.000000 1.0 56 970 976 6
32 0.000000 0.15 0.000000 0.000000 0.000000 0.000000 1.0 0 977 999 22
In [25]:
# Convert the edited table back into the conditioning dict. The length comes from
# hp.sequence_length (set when the hyper-parameters were loaded) instead of a repeated
# literal 1000, so the two values cannot drift apart.
conditioning_dict = conditioning_df_to_dict(conditioning_df, length=hp.sequence_length)
# Regenerate audio and synth parameters from the edited conditioning.
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(
    conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [26]:
# Reference audio versus the audio regenerated with a crescendo on every row.
plot_pred_acoustic_feature(
    sample['audio'].numpy()[0],
    midi_audio_changed.numpy()[0],
    get_synth_params(params_changed),
    mask_zero_f0=True,
)

ALL decrescendo

In [27]:
# Start again from the conditioning extracted from the sample so earlier edits do not leak in.
(synth_params_normalized,
 midi_features,
 conditioning_dict) = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(
    conditioning_dict,
    sample['onsets'],
    sample['offsets'],
    sample['midi'],
)
# Decrescendo on every row: amplitude maximum at the start of the note (position 0) with
# a loudness std of 0.2.
conditioning_df = conditioning_df.assign(
    amplitudes_max_pos=np.zeros_like(conditioning_df['amplitudes_max_pos'].values),
    loudness_std=np.ones_like(conditioning_df['loudness_std'].values) * 0.2,
)
# Optional variant (disabled): also scale the brightness down.
# conditioning_df['brightness'] = conditioning_df['brightness'].values * 0.8
conditioning_df
Out[27]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.187784 0.2 0.000000 0.092456 0.070266 0.000000 0.0 58 0 13 13
1 0.200037 0.2 0.000000 0.084715 0.079794 0.000000 0.0 57 14 15 1
2 0.181757 0.2 0.000000 0.091589 0.100020 0.000000 0.0 58 16 31 15
3 0.660257 0.2 0.000000 0.124652 0.325231 0.000000 0.0 58 32 63 31
4 0.598673 0.2 0.000000 0.144210 0.464790 0.000000 0.0 59 64 65 1
5 0.665977 0.2 0.000000 0.114072 0.433672 0.000000 0.0 60 66 86 20
6 0.679594 0.2 0.000000 0.116638 0.404232 0.000000 0.0 61 87 116 29
7 0.659117 0.2 0.000000 0.109499 0.398962 0.000000 0.0 60 117 143 26
8 0.674703 0.2 0.000000 0.151181 0.384552 0.000000 0.0 58 144 213 69
9 0.621182 0.2 0.000000 0.158142 0.270815 0.000000 0.0 59 214 219 5
10 0.639629 0.2 0.413912 0.144228 0.273157 7.257257 0.0 57 220 371 151
11 0.456946 0.2 0.000000 0.039509 0.235051 0.000000 0.0 54 372 376 4
12 0.698726 0.2 0.000000 0.186284 0.283507 0.000000 0.0 55 377 437 60
13 0.000000 0.2 0.000000 0.000000 0.000000 0.000000 0.0 0 438 444 6
14 0.540217 0.2 0.000000 0.068520 0.280882 0.000000 0.0 50 445 481 36
15 0.473179 0.2 0.000000 0.044180 0.274952 0.000000 0.0 52 482 495 13
16 0.718757 0.2 0.000000 0.220133 0.369267 0.000000 0.0 53 496 554 58
17 0.631286 0.2 0.000000 0.154662 0.389493 0.000000 0.0 54 555 556 1
18 0.680718 0.2 0.105454 0.181939 0.364396 4.004004 0.0 55 557 678 121
19 0.517053 0.2 0.000000 0.102532 0.383083 0.000000 0.0 54 679 681 2
20 0.565903 0.2 0.000000 0.117449 0.311977 0.000000 0.0 53 682 694 12
21 0.505786 0.2 0.000000 0.083146 0.280099 0.000000 0.0 54 695 702 7
22 0.405598 0.2 0.000000 0.036016 0.135055 0.000000 0.0 55 703 707 4
23 0.354084 0.2 0.000000 0.030930 0.176790 0.000000 0.0 56 708 714 6
24 0.352441 0.2 0.000000 0.033833 0.259730 0.000000 0.0 55 715 720 5
25 0.495777 0.2 0.000000 0.046683 0.270849 0.000000 0.0 54 721 738 17
26 0.660683 0.2 0.685653 0.155514 0.420273 7.507507 0.0 55 739 828 89
27 0.411062 0.2 0.000000 0.038148 0.153582 0.000000 0.0 56 829 841 12
28 0.284760 0.2 0.000000 0.043985 0.192838 0.000000 0.0 55 842 844 2
29 0.000000 0.2 0.000000 0.000000 0.000000 0.000000 0.0 0 845 959 114
30 -0.047100 0.2 0.000000 0.077097 -0.004628 0.000000 0.0 55 960 969 9
31 0.029876 0.2 0.000000 0.075516 0.030284 0.000000 0.0 56 970 976 6
32 0.000000 0.2 0.000000 0.000000 0.000000 0.000000 0.0 0 977 999 22
In [28]:
# Convert the edited table back into the conditioning dict. The length comes from
# hp.sequence_length (set when the hyper-parameters were loaded) instead of a repeated
# literal 1000, so the two values cannot drift apart.
conditioning_dict = conditioning_df_to_dict(conditioning_df, length=hp.sequence_length)
# Regenerate audio and synth parameters from the edited conditioning.
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(
    conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [29]:
# Reference audio versus the audio regenerated with a decrescendo on every row.
plot_pred_acoustic_feature(
    sample['audio'].numpy()[0],
    midi_audio_changed.numpy()[0],
    get_synth_params(params_changed),
    mask_zero_f0=True,
)
In [ ]:
 

ALL staccato

In [36]:
# Start again from the conditioning extracted from the sample so earlier edits do not leak in.
(synth_params_normalized,
 midi_features,
 conditioning_dict) = model.gen_cond_dict_from_feature(sample, training=False)
conditioning_df = conditioining_dict_to_df(
    conditioning_dict,
    sample['onsets'],
    sample['offsets'],
    sample['midi'],
)
# Staccato approximation: amplitude maximum at the start of each note, a loudness std of
# 0.2, and brightness scaled to 45% of its extracted value.
conditioning_df = conditioning_df.assign(
    amplitudes_max_pos=np.zeros_like(conditioning_df['amplitudes_max_pos'].values),
    loudness_std=np.ones_like(conditioning_df['loudness_std'].values) * 0.2,
    brightness=conditioning_df['brightness'].values * 0.45,
)
conditioning_df
Out[36]:
loudness_mean loudness_std pitch_variation_std brightness attack_level vibrato_rate amplitudes_max_pos pitch onset offset note_length
0 0.187784 0.2 0.000000 0.041605 0.070266 0.000000 0.0 58 0 13 13
1 0.200037 0.2 0.000000 0.038122 0.079794 0.000000 0.0 57 14 15 1
2 0.181757 0.2 0.000000 0.041215 0.100020 0.000000 0.0 58 16 31 15
3 0.660257 0.2 0.000000 0.056093 0.325231 0.000000 0.0 58 32 63 31
4 0.598673 0.2 0.000000 0.064895 0.464790 0.000000 0.0 59 64 65 1
5 0.665977 0.2 0.000000 0.051332 0.433672 0.000000 0.0 60 66 86 20
6 0.679594 0.2 0.000000 0.052487 0.404232 0.000000 0.0 61 87 116 29
7 0.659117 0.2 0.000000 0.049274 0.398962 0.000000 0.0 60 117 143 26
8 0.674703 0.2 0.000000 0.068031 0.384552 0.000000 0.0 58 144 213 69
9 0.621182 0.2 0.000000 0.071164 0.270815 0.000000 0.0 59 214 219 5
10 0.639629 0.2 0.413912 0.064903 0.273157 7.257257 0.0 57 220 371 151
11 0.456946 0.2 0.000000 0.017779 0.235051 0.000000 0.0 54 372 376 4
12 0.698726 0.2 0.000000 0.083828 0.283507 0.000000 0.0 55 377 437 60
13 0.000000 0.2 0.000000 0.000000 0.000000 0.000000 0.0 0 438 444 6
14 0.540217 0.2 0.000000 0.030834 0.280882 0.000000 0.0 50 445 481 36
15 0.473179 0.2 0.000000 0.019881 0.274952 0.000000 0.0 52 482 495 13
16 0.718757 0.2 0.000000 0.099060 0.369267 0.000000 0.0 53 496 554 58
17 0.631286 0.2 0.000000 0.069598 0.389493 0.000000 0.0 54 555 556 1
18 0.680718 0.2 0.105454 0.081873 0.364396 4.004004 0.0 55 557 678 121
19 0.517053 0.2 0.000000 0.046140 0.383083 0.000000 0.0 54 679 681 2
20 0.565903 0.2 0.000000 0.052852 0.311977 0.000000 0.0 53 682 694 12
21 0.505786 0.2 0.000000 0.037416 0.280099 0.000000 0.0 54 695 702 7
22 0.405598 0.2 0.000000 0.016207 0.135055 0.000000 0.0 55 703 707 4
23 0.354084 0.2 0.000000 0.013919 0.176790 0.000000 0.0 56 708 714 6
24 0.352441 0.2 0.000000 0.015225 0.259730 0.000000 0.0 55 715 720 5
25 0.495777 0.2 0.000000 0.021007 0.270849 0.000000 0.0 54 721 738 17
26 0.660683 0.2 0.685653 0.069981 0.420273 7.507507 0.0 55 739 828 89
27 0.411062 0.2 0.000000 0.017167 0.153582 0.000000 0.0 56 829 841 12
28 0.284760 0.2 0.000000 0.019793 0.192838 0.000000 0.0 55 842 844 2
29 0.000000 0.2 0.000000 0.000000 0.000000 0.000000 0.0 0 845 959 114
30 -0.047100 0.2 0.000000 0.034694 -0.004628 0.000000 0.0 55 960 969 9
31 0.029876 0.2 0.000000 0.033982 0.030284 0.000000 0.0 56 970 976 6
32 0.000000 0.2 0.000000 0.000000 0.000000 0.000000 0.0 0 977 999 22
In [37]:
# Convert the edited table back into the conditioning dict. The length comes from
# hp.sequence_length (set when the hyper-parameters were loaded) instead of a repeated
# literal 1000, so the two values cannot drift apart.
conditioning_dict = conditioning_df_to_dict(conditioning_df, length=hp.sequence_length)
# Regenerate audio and synth parameters from the edited conditioning.
midi_audio_changed, params_changed = model.gen_audio_from_cond_dict(
    conditioning_dict, midi_features, instrument_id=sample['instrument_id'])
In [38]:
# Reference audio versus the audio regenerated with the staccato settings.
plot_pred_acoustic_feature(
    sample['audio'].numpy()[0],
    midi_audio_changed.numpy()[0],
    get_synth_params(params_changed),
    mask_zero_f0=True,
)
In [ ]: